### Fastlink code used for merging employees to the voter file.

## (start with all the same data setup/cleanup stuff as the main merge dataset, then switch to fastlink syntax)

library(data.table) 
library(fst)

#steps 1/2 are going to be the same as the main merge for now, so will just pull in the resulting dataset created in Merge3_jan2019.R
voters <- as.data.table(read_fst("voters_nyc_only.fst"))

# Standardize the vote-history text first: drop every comma, then upper-case,
# so the election labels defined below can be compared verbatim.
voters[, voterhistory := toupper(gsub(",", "", voterhistory, fixed = TRUE))]

#generate the possible indicators of having voted
# Build every spelling under which one general election can appear in the
# vote history. `year` is the four-digit year; `date` is the yyyymmdd election
# date, or NULL for elections that only have the year-based spellings.
general_election_labels <- function(year, date = NULL) {
  by_year <- c(
    paste(year, "GENERAL ELECTION"),
    paste("GENERAL", year),
    paste("GENERAL ELECTION", year)
  )
  if (is.null(date)) {
    return(by_year)
  }
  c(paste(date, "GE"), paste("GE", date), by_year)
}

options17 <- general_election_labels("2017", "20171107") #note this is only on the 2018 voter file.
options16 <- general_election_labels("2016", "20161108")
options14 <- general_election_labels("2014", "20141104")
options13 <- general_election_labels("2013", "20131105")
options12 <- general_election_labels("2012", "20121106")
options10 <- general_election_labels("2010", "20101102")
options08 <- general_election_labels("2008", "20081104")
# 2006 and earlier only have the year-based spellings.
options06 <- general_election_labels("2006")
options04 <- general_election_labels("2004")
options02 <- general_election_labels("2002")



ptm <- proc.time() # start the clock for the timing check that follows the function

#' Flag whether a vote history mentions a given election.
#'
#' @param votehistory One voter's history: election labels joined by ";".
#'   Coerced with as.character(), so factor input works as well.
#' @param optionslist Character vector of labels that all denote the election
#'   of interest.
#' @return 1 if any ";"-separated entry exactly equals one of `optionslist`,
#'   otherwise 0 (this includes NA and empty histories).
findvotehist <- function(votehistory, optionslist) {
  # fixed = TRUE: ";" is a literal separator, so no regex engine is needed.
  histsplit <- unlist(strsplit(as.character(votehistory), ";", fixed = TRUE))
  # NOTE(review): entries are compared verbatim; if the raw file pads entries
  # with spaces around ";" they will not match -- confirm against the data.
  if (any(histsplit %in% optionslist)) 1 else 0
}

# Timing check on (at most) the first 100,000 voters before running the full
# file. head() never returns more rows than exist, whereas indexing 1:100000
# pads a shorter data.table with all-NA rows.
testrows <- head(voters, 100000)
# vapply() pins the result to one number per voter, and USE.NAMES = FALSE
# avoids labelling every element with its full history string.
set(
  testrows,
  j = "voted16",
  value = vapply(
    testrows$voterhistory, findvotehist, numeric(1),
    optionslist = options16, USE.NAMES = FALSE
  )
)
proc.time() - ptm

# One 0/1 turnout column per general election. The list names are the columns
# to create (in this order); the values are the label sets to search for.
vote_options <- list(
  voted17 = options17,
  voted16 = options16,
  voted14 = options14,
  voted13 = options13,
  voted12 = options12,
  voted10 = options10,
  voted08 = options08,
  voted06 = options06,
  voted04 = options04,
  voted02 = options02
)

ptm <- proc.time()
for (flag in names(vote_options)) {
  # vapply() guarantees a plain numeric vector (sapply() returns list() on
  # empty input and names every element by its history string); set() adds
  # the column by reference instead of copying the whole table each time.
  set(
    voters,
    j = flag,
    value = vapply(
      voters$voterhistory, findvotehist, numeric(1),
      optionslist = vote_options[[flag]], USE.NAMES = FALSE
    )
  )
}
proc.time() - ptm # elapsed time for all ten turnout columns


# Load the employee records that will be linked against the voter file.
# NOTE(review): read_feather() is not exported by data.table or fst, the only
# packages attached above this line; presumably library(feather) or
# library(arrow) has to be attached for this call to resolve -- confirm.
employeeswide <- read_feather("NycEmployeesformerge.feather")

##############
#Step 3:The Merge
##############

# Give voters an upper-case one-character middle initial so the column lines
# up in format with the employee side before linking on it.
voters[, MidInit := toupper(substring(middlename, 1L, 1L))]

library(fastLink) # from here on the script diverges from the main merge
# Goal: link probabilistically on first name, last name and middle initial.

# Rename the voter name columns to the names used for the link variables.
setnames(voters, old = "lastname", new = "LastName")
setnames(voters, old = "firstname", new = "FirstName")

#start with pretty much the defaults, and maybe a subset of employees so it doesn't crash?
#employeesGROUP <- sample(1:10, nrow(employeeswide), replace=T)
#employeesample <- employeeswide[sample(1:nrow(employeeswide), 10000),]; dim(employeesample)


# Blocking keys: the lower-cased first character of each first name, one entry
# per row of the corresponding table (NA where the first name is NA).
employees_first_letter <- tolower(substring(employeeswide$FirstName, 1L, 1L))
voters_first_letter <- tolower(substring(voters$FirstName, 1L, 1L))


# Variables used for linking; all three are compared by string distance and
# allowed to match partially.
match_vars <- c("LastName", "FirstName", "MidInit")

# Link the employees and voters whose first names start with `letter`.
# Returns the matched rows from getMatches(), or NULL when either side has no
# rows for that letter (an empty block has nothing to link).
link_letter_block <- function(letter) {
  # which() drops NA comparisons. With a plain logical index, a data.frame or
  # tibble returns an all-NA row for every NA first name, in every block.
  emp_block <- employeeswide[which(employees_first_letter == letter), ]
  voter_block <- voters[which(voters_first_letter == letter), ]
  if (nrow(emp_block) == 0L || nrow(voter_block) == 0L) {
    return(NULL)
  }

  rs.out <- fastLink(emp_block,
                     voter_block,
                     varnames = match_vars,
                     stringdist.match = match_vars,
                     partial.match = match_vars,
                     n.cores = 5)

  getMatches(emp_block, voter_block, rs.out)
}

# Block on first initial so each fastLink run stays small.
# NOTE(review): only first names starting with a-z are linked; names that are
# NA or start with any other character are never compared -- confirm intended.
merged_by_letter <- Filter(Negate(is.null), lapply(letters, link_letter_block))
if (length(merged_by_letter) == 0L) {
  stop("no first-letter block had rows in both data sets; nothing to write",
       call. = FALSE)
}

# Bind once at the end instead of growing `merged` inside a loop.
merged <- do.call(rbind, merged_by_letter)

fwrite(merged, 'merged_fastlink.csv')
